{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 83,
   "metadata": {},
   "outputs": [],
   "source": [
    "import time\n",
    "from random import random\n",
    "\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "import selenium\n",
    "from lxml.html import fromstring\n",
    "from requests_html import HTMLSession\n",
    "from selenium import webdriver\n",
    "from selenium.webdriver.common.by import By\n",
    "from selenium.webdriver.common.desired_capabilities import DesiredCapabilities"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 84,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Chrome launch flags for the scraping session.\n",
    "opts = webdriver.ChromeOptions()\n",
    "opts.add_argument('--no-sandbox')  # works around the DevToolsActivePort-file-not-found startup error\n",
    "opts.add_argument('--window-size=1920,3000')  # browser resolution; Chrome expects width,height\n",
    "opts.add_argument('--disable-gpu')  # flag Google's docs recommend to sidestep a bug\n",
    "opts.add_argument('--hide-scrollbars')  # hide scrollbars; helps with some unusual pages\n",
    "\n",
    "# Optional toggles, disabled by default:\n",
    "# opts.add_argument('blink-settings=imagesEnabled=false')  # skip images to load faster\n",
    "# opts.add_argument('--headless')  # no visible window; on Linux without a display, startup fails without it\n",
    "# opts.binary_location = r\"C:\\portable\\PortableApps\\IronPortable\\App\\Iron\\chrome.exe\"  # custom browser binary\n",
    "\n",
    "# `options=` replaces the deprecated `chrome_options=` keyword.\n",
    "driver = webdriver.Chrome(options=opts)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 85,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Open the CNKI home page.\n",
    "driver.get('https://www.cnki.net/')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 86,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'中山大学南方学院'"
      ]
     },
     "execution_count": 86,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Check that the session is logged in as the institution 中山大学南方学院\n",
    "# (Nanfang College of Sun Yat-sen University).\n",
    "# find_element(By.ID, ...) replaces find_element_by_id, which newer Selenium releases removed.\n",
    "driver.find_element(By.ID, 'Ecp_loginShowName1').get_attribute('innerHTML')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 87,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Click the \"Advanced Search\" entry.\n",
    "# (The former mid-cell get_attribute call was dropped: its value was never displayed or used.)\n",
    "element = driver.find_element(By.ID, 'highSearch')\n",
    "element.click()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 88,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['CDwindow-142ED4C6A10ECBA6B1A26B59E771BF77',\n",
       " 'CDwindow-D2BE11C78CEAA95F118D785EE2E6D357']"
      ]
     },
     "execution_count": 88,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# List the handles of every open browser window.\n",
    "driver.window_handles"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 89,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'CDwindow-142ED4C6A10ECBA6B1A26B59E771BF77'"
      ]
     },
     "execution_count": 89,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Handle of the window the driver is currently attached to.\n",
    "driver.current_window_handle"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 90,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Switch the driver to the second window in the handle list.\n",
    "# `switch_to.window` replaces the deprecated `switch_to_window`.\n",
    "driver.switch_to.window(driver.window_handles[1])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 91,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Click the \"Academic Journals\" tab.\n",
    "# (The former mid-cell get_attribute call was dropped: its value was never displayed or used.)\n",
    "element = driver.find_element(By.XPATH, '/html/body/div[3]/div[1]/div/ul[1]/li[1]/a/span')\n",
    "element.click()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 期刊来源"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 92,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Click the \"Professional Search\" tab.\n",
    "# find_element(By.XPATH, ...) replaces find_element_by_xpath, which newer Selenium releases removed.\n",
    "element = driver.find_element(By.XPATH, '/html/body/div[2]/div/div[2]/ul/li[4]')\n",
    "element.click()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 93,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Search expression to submit (terms: 新媒体 = new media, 互联网 = internet).\n",
    "# Alternative expression kept for reference:\n",
    "# '(TI=\"人工智能\" and SU=\"新媒体\") OR (TI=\"新媒体\" and SU=\"网络\") OR (TI=\"新媒体\" and SU=\"新媒体\")'\n",
    "query = 'SU=\"新媒体\" AND SU=\"互联网\"'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 94,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Empty the search textarea, then type the query into it.\n",
    "# find_element(By.XPATH, ...) replaces find_element_by_xpath, which newer Selenium releases removed.\n",
    "element = driver.find_element(By.XPATH, '/html/body/div[2]/div/div[2]/div/div[1]/div[1]/div[2]/textarea')\n",
    "element.clear()\n",
    "element.send_keys(query)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 95,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Click the search button (the input whose value attribute is 检索).\n",
    "# click() returns None, so its result is no longer bound to `element`.\n",
    "driver.find_element(By.XPATH, '//input[@value=\"检索\"]').click()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 96,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'12,708'"
      ]
     },
     "execution_count": 96,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Number of articles matched by the search, as shown on the page.\n",
    "# find_element(By.XPATH, ...) replaces find_element_by_xpath, which newer Selenium releases removed.\n",
    "element = driver.find_element(By.XPATH, '//*[@id=\"countPageDiv\"]/span[1]/em')\n",
    "element.get_attribute('innerHTML')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 100,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Change the number of articles per page: click the control inside #perPageDiv.\n",
    "# find_element(By.XPATH, ...) replaces find_element_by_xpath, which newer Selenium releases removed.\n",
    "element = driver.find_element(By.XPATH, '//*[@id=\"perPageDiv\"]/div/i')\n",
    "element.click()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 101,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Click the list entry whose data-val is 50.\n",
    "# find_element(By.XPATH, ...) replaces find_element_by_xpath, which newer Selenium releases removed.\n",
    "element = driver.find_element(By.XPATH, '//li[@data-val=\"50\"]/a')\n",
    "element.click()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 抓取页面信息"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 102,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Unnamed: 0</th>\n",
       "      <th>篇名</th>\n",
       "      <th>作者</th>\n",
       "      <th>刊名</th>\n",
       "      <th>发表时间</th>\n",
       "      <th>被引</th>\n",
       "      <th>下载</th>\n",
       "      <th>操作</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>互联网思维模式下的新媒体营销策略探索</td>\n",
       "      <td>郑昕</td>\n",
       "      <td>中小企业管理与科技(中旬刊)</td>\n",
       "      <td>2021-06-15</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2</td>\n",
       "      <td>互联网背景下民族高校辅导员思想引领能力结构分析及提升路径</td>\n",
       "      <td>崔洋洋; 康丽滢; 陈志新</td>\n",
       "      <td>办公自动化</td>\n",
       "      <td>2021-06-15</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>3</td>\n",
       "      <td>“互联网+”背景下新媒体短视频对英语口语学习的影响研究——以抖音短视频为例</td>\n",
       "      <td>邵玉杰; 关淼; 王琦</td>\n",
       "      <td>现代商贸工业</td>\n",
       "      <td>2021-06-11</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>4</td>\n",
       "      <td>“互联网+”背景下图书出版与新媒体融合发展的路径探析</td>\n",
       "      <td>刘芳</td>\n",
       "      <td>文化产业</td>\n",
       "      <td>2021-06-10</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>5</td>\n",
       "      <td>吕梁市县级融媒体中心发展建设探究</td>\n",
       "      <td>郭小芳</td>\n",
       "      <td>吕梁学院学报</td>\n",
       "      <td>2021-06-07</td>\n",
       "      <td>NaN</td>\n",
       "      <td>26.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>6</td>\n",
       "      <td>互联网时代下邮轮旅游网络营销策略分析</td>\n",
       "      <td>杜峰帅; 石兴; 肖素雅; 宋林潇; 罗群</td>\n",
       "      <td>商讯</td>\n",
       "      <td>2021-06-05</td>\n",
       "      <td>NaN</td>\n",
       "      <td>73.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>7</td>\n",
       "      <td>互联网新媒体传播中农村职业培训脱贫致富基本内涵的逻辑生成</td>\n",
       "      <td>杨宗晓; 杨克</td>\n",
       "      <td>农业经济</td>\n",
       "      <td>2021-06-04</td>\n",
       "      <td>NaN</td>\n",
       "      <td>82.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>8</td>\n",
       "      <td>基于“互联网+”背景的四川阿坝藏族羌族自治州普法工作创新研究</td>\n",
       "      <td>田裕婷; 索郎玉珍</td>\n",
       "      <td>科学咨询(科技·管理)</td>\n",
       "      <td>2021-06-04</td>\n",
       "      <td>NaN</td>\n",
       "      <td>6.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>9</td>\n",
       "      <td>新媒体视角下的温州传统伴手礼品牌构建与推广研究——以手工艺品为例</td>\n",
       "      <td>陈瑶</td>\n",
       "      <td>现代营销(经营版)</td>\n",
       "      <td>2021-06-01</td>\n",
       "      <td>NaN</td>\n",
       "      <td>4.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>10</td>\n",
       "      <td>“互联网+”背景下农产品新媒体营销方法</td>\n",
       "      <td>吉思</td>\n",
       "      <td>现代营销(经营版)</td>\n",
       "      <td>2021-06-01</td>\n",
       "      <td>NaN</td>\n",
       "      <td>20.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10</th>\n",
       "      <td>11</td>\n",
       "      <td>新媒体与高校思想政治教育研究——评《新媒体视域下高校思想政治教育的解读与重构》</td>\n",
       "      <td>高倩楠</td>\n",
       "      <td>中国测试</td>\n",
       "      <td>2021-05-31</td>\n",
       "      <td>NaN</td>\n",
       "      <td>16.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>11</th>\n",
       "      <td>12</td>\n",
       "      <td>互联网+背景下博物馆文化创意产品研发路径</td>\n",
       "      <td>朱科宇; 彭静</td>\n",
       "      <td>今古文创</td>\n",
       "      <td>2021-05-28</td>\n",
       "      <td>NaN</td>\n",
       "      <td>580.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>12</th>\n",
       "      <td>13</td>\n",
       "      <td>论我国网上理论阵地的传播态势与发展大势</td>\n",
       "      <td>王凤翔</td>\n",
       "      <td>湖南大学学报(社会科学版)</td>\n",
       "      <td>2021-05-28</td>\n",
       "      <td>NaN</td>\n",
       "      <td>31.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>13</th>\n",
       "      <td>14</td>\n",
       "      <td>网络新媒体背景下大学生思想政治教育的发展新趋势</td>\n",
       "      <td>李文峰</td>\n",
       "      <td>湖北开放职业学院学报</td>\n",
       "      <td>2021-05-28</td>\n",
       "      <td>NaN</td>\n",
       "      <td>28.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>14</th>\n",
       "      <td>15</td>\n",
       "      <td>基于分众传播视角下的地方媒体融合路径探析</td>\n",
       "      <td>宿晓伟</td>\n",
       "      <td>传媒论坛</td>\n",
       "      <td>2021-05-25</td>\n",
       "      <td>NaN</td>\n",
       "      <td>126.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>15</th>\n",
       "      <td>16</td>\n",
       "      <td>新媒体时代围棋文化传播面临的机遇与挑战</td>\n",
       "      <td>薛铭安</td>\n",
       "      <td>传媒论坛</td>\n",
       "      <td>2021-05-25</td>\n",
       "      <td>NaN</td>\n",
       "      <td>124.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>16</th>\n",
       "      <td>17</td>\n",
       "      <td>新媒体广告生态下互联网营销服务平台的赋能逻辑——以巨量引擎为例</td>\n",
       "      <td>李红妮</td>\n",
       "      <td>传媒</td>\n",
       "      <td>2021-05-25</td>\n",
       "      <td>NaN</td>\n",
       "      <td>60.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>17</th>\n",
       "      <td>18</td>\n",
       "      <td>互联网背景下企业思政工作的信息化建设</td>\n",
       "      <td>高磊</td>\n",
       "      <td>公关世界</td>\n",
       "      <td>2021-05-25</td>\n",
       "      <td>NaN</td>\n",
       "      <td>9.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>18</th>\n",
       "      <td>19</td>\n",
       "      <td>大数据背景下传统媒体突围策略分析</td>\n",
       "      <td>于佳</td>\n",
       "      <td>中国报业</td>\n",
       "      <td>2021-05-25</td>\n",
       "      <td>NaN</td>\n",
       "      <td>54.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>19</th>\n",
       "      <td>20</td>\n",
       "      <td>基于互联网时代的传统广播新媒体化的路径探讨</td>\n",
       "      <td>李东; 徐燕群</td>\n",
       "      <td>新闻传播</td>\n",
       "      <td>2021-05-23</td>\n",
       "      <td>NaN</td>\n",
       "      <td>10.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>20</th>\n",
       "      <td>21</td>\n",
       "      <td>企业工会实施“互联网+工会”工作模式的实践思考</td>\n",
       "      <td>周春丽</td>\n",
       "      <td>工会博览</td>\n",
       "      <td>2021-05-20</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>21</th>\n",
       "      <td>22</td>\n",
       "      <td>让跟评成为地方新闻APP的胜负手</td>\n",
       "      <td>余淮</td>\n",
       "      <td>中国报业</td>\n",
       "      <td>2021-05-15</td>\n",
       "      <td>NaN</td>\n",
       "      <td>8.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>22</th>\n",
       "      <td>23</td>\n",
       "      <td>互联网与新媒体</td>\n",
       "      <td>NaN</td>\n",
       "      <td>广播电视信息</td>\n",
       "      <td>2021-05-15</td>\n",
       "      <td>NaN</td>\n",
       "      <td>137.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>23</th>\n",
       "      <td>24</td>\n",
       "      <td>“互联网+”背景下煤矿企业工会宣传创新对策</td>\n",
       "      <td>秦越</td>\n",
       "      <td>中小企业管理与科技(中旬刊)</td>\n",
       "      <td>2021-05-15</td>\n",
       "      <td>NaN</td>\n",
       "      <td>32.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>24</th>\n",
       "      <td>25</td>\n",
       "      <td>新媒体内容的选择策略</td>\n",
       "      <td>胡亚兰</td>\n",
       "      <td>企业文明</td>\n",
       "      <td>2021-05-15</td>\n",
       "      <td>NaN</td>\n",
       "      <td>31.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>25</th>\n",
       "      <td>26</td>\n",
       "      <td>广播电视技术与互联网技术的融合分析</td>\n",
       "      <td>刘颖</td>\n",
       "      <td>中国有线电视</td>\n",
       "      <td>2021-05-15</td>\n",
       "      <td>NaN</td>\n",
       "      <td>27.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>26</th>\n",
       "      <td>27</td>\n",
       "      <td>互联网环境下大学生意识形态和思想政治教育研究</td>\n",
       "      <td>相菲</td>\n",
       "      <td>产业与科技论坛</td>\n",
       "      <td>2021-05-15</td>\n",
       "      <td>NaN</td>\n",
       "      <td>43.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>27</th>\n",
       "      <td>28</td>\n",
       "      <td>融合媒体背景之下编辑转型之路</td>\n",
       "      <td>白绍华</td>\n",
       "      <td>中国传媒科技</td>\n",
       "      <td>2021-05-15</td>\n",
       "      <td>NaN</td>\n",
       "      <td>11.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>28</th>\n",
       "      <td>29</td>\n",
       "      <td>浅谈博物馆的新媒体运用以及相关问题——以四川自贡恐龙博物馆为例</td>\n",
       "      <td>吴雨纱</td>\n",
       "      <td>中国传媒科技</td>\n",
       "      <td>2021-05-15</td>\n",
       "      <td>NaN</td>\n",
       "      <td>23.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>29</th>\n",
       "      <td>30</td>\n",
       "      <td>试析融媒体平台发展现状和突破——以“中央厨房”为例</td>\n",
       "      <td>钱璟</td>\n",
       "      <td>中国传媒科技</td>\n",
       "      <td>2021-05-15</td>\n",
       "      <td>NaN</td>\n",
       "      <td>88.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>30</th>\n",
       "      <td>31</td>\n",
       "      <td>新媒体语境下数据造假与治理策略</td>\n",
       "      <td>岳圆</td>\n",
       "      <td>记者摇篮</td>\n",
       "      <td>2021-05-15</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>31</th>\n",
       "      <td>32</td>\n",
       "      <td>“互联网+”视域下甘肃少数民族文化数字化建设路径探析</td>\n",
       "      <td>张娅琼</td>\n",
       "      <td>科学咨询(教育科研)</td>\n",
       "      <td>2021-05-14</td>\n",
       "      <td>NaN</td>\n",
       "      <td>26.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>32</th>\n",
       "      <td>33</td>\n",
       "      <td>网络经济下河南省科技型中小企业营销创新路径与策略</td>\n",
       "      <td>王炳刚</td>\n",
       "      <td>人才资源开发</td>\n",
       "      <td>2021-05-10</td>\n",
       "      <td>NaN</td>\n",
       "      <td>81.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>33</th>\n",
       "      <td>34</td>\n",
       "      <td>大数据与新媒体时代医学期刊的办刊之路</td>\n",
       "      <td>焦骞; 刘卓; 董军杰; 张爱净</td>\n",
       "      <td>传媒论坛</td>\n",
       "      <td>2021-05-10</td>\n",
       "      <td>NaN</td>\n",
       "      <td>34.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>34</th>\n",
       "      <td>35</td>\n",
       "      <td>媒体融合视域下城市广播电视报发展策略——以《淮安广播电视》为例</td>\n",
       "      <td>万晓莉</td>\n",
       "      <td>视听</td>\n",
       "      <td>2021-05-10</td>\n",
       "      <td>NaN</td>\n",
       "      <td>53.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>35</th>\n",
       "      <td>36</td>\n",
       "      <td>新时代网络反腐的媒介化实践及其制度重构——以中纪委网站的反腐实践为例</td>\n",
       "      <td>董浩</td>\n",
       "      <td>传媒观察</td>\n",
       "      <td>2021-05-10</td>\n",
       "      <td>NaN</td>\n",
       "      <td>86.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>36</th>\n",
       "      <td>37</td>\n",
       "      <td>新媒体时代高等艺术设计教育体系构建策略研究</td>\n",
       "      <td>李珂</td>\n",
       "      <td>艺术教育</td>\n",
       "      <td>2021-05-10</td>\n",
       "      <td>NaN</td>\n",
       "      <td>14.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>37</th>\n",
       "      <td>38</td>\n",
       "      <td>网络“VACA”时代高校教师培训体系构建策略探析</td>\n",
       "      <td>梁琛</td>\n",
       "      <td>新闻研究导刊</td>\n",
       "      <td>2021-05-10</td>\n",
       "      <td>NaN</td>\n",
       "      <td>7.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>38</th>\n",
       "      <td>39</td>\n",
       "      <td>全媒体时代大学生理想信念教育提升路径——以滁州学院为例</td>\n",
       "      <td>陈沭文; 张晶晶</td>\n",
       "      <td>新闻研究导刊</td>\n",
       "      <td>2021-05-10</td>\n",
       "      <td>NaN</td>\n",
       "      <td>10.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>39</th>\n",
       "      <td>40</td>\n",
       "      <td>高校学生沟通机制探究——以北京某高校为例</td>\n",
       "      <td>何家唯; 张权</td>\n",
       "      <td>学校党建与思想教育</td>\n",
       "      <td>2021-05-08</td>\n",
       "      <td>NaN</td>\n",
       "      <td>69.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>40</th>\n",
       "      <td>41</td>\n",
       "      <td>后疫情时代改善与重塑国家形象的新媒体传播策略</td>\n",
       "      <td>匡文波; 马茜茜</td>\n",
       "      <td>新闻与写作</td>\n",
       "      <td>2021-05-05</td>\n",
       "      <td>NaN</td>\n",
       "      <td>475.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>41</th>\n",
       "      <td>42</td>\n",
       "      <td>促进曲艺网络传播 助力传统艺术繁荣</td>\n",
       "      <td>张天来</td>\n",
       "      <td>曲艺</td>\n",
       "      <td>2021-05-01</td>\n",
       "      <td>NaN</td>\n",
       "      <td>18.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>42</th>\n",
       "      <td>43</td>\n",
       "      <td>互联网背景下大学公共英语教学效率提升研究——评《网络与新媒体专业英语教程》</td>\n",
       "      <td>赵鹏</td>\n",
       "      <td>中国广播电视学刊</td>\n",
       "      <td>2021-05-01</td>\n",
       "      <td>NaN</td>\n",
       "      <td>21.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>43</th>\n",
       "      <td>44</td>\n",
       "      <td>“三全”育人背景下高校“党建+”特色档案融入课程思政教育的研究</td>\n",
       "      <td>肖国圣</td>\n",
       "      <td>产业与科技论坛</td>\n",
       "      <td>2021-05-01</td>\n",
       "      <td>NaN</td>\n",
       "      <td>38.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>44</th>\n",
       "      <td>45</td>\n",
       "      <td>乡村振兴视域下汕尾农产品新媒体营销的对策研究</td>\n",
       "      <td>吴丽文</td>\n",
       "      <td>市场周刊</td>\n",
       "      <td>2021-05-01</td>\n",
       "      <td>NaN</td>\n",
       "      <td>1.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>45</th>\n",
       "      <td>46</td>\n",
       "      <td>The Influence of New Media on College Students...</td>\n",
       "      <td>Hui Ai</td>\n",
       "      <td>Journal of Contemporary Educational Research</td>\n",
       "      <td>2021-05-01</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>46</th>\n",
       "      <td>47</td>\n",
       "      <td>新媒体背景下高校招生营销策略研究</td>\n",
       "      <td>张婷婷</td>\n",
       "      <td>广西质量监督导报</td>\n",
       "      <td>2021-04-28</td>\n",
       "      <td>NaN</td>\n",
       "      <td>129.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>47</th>\n",
       "      <td>48</td>\n",
       "      <td>新时代高校网络思想政治教育创新发展逻辑理路</td>\n",
       "      <td>张培卫</td>\n",
       "      <td>高校辅导员学刊</td>\n",
       "      <td>2021-04-28</td>\n",
       "      <td>NaN</td>\n",
       "      <td>73.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>48</th>\n",
       "      <td>49</td>\n",
       "      <td>既抓增量又抓存量，防止改革“翻烧饼”</td>\n",
       "      <td>李社军</td>\n",
       "      <td>新闻潮</td>\n",
       "      <td>2021-04-28</td>\n",
       "      <td>NaN</td>\n",
       "      <td>2.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>49</th>\n",
       "      <td>50</td>\n",
       "      <td>新时期新闻信息高质量传播路径探讨</td>\n",
       "      <td>覃雅妮</td>\n",
       "      <td>新闻潮</td>\n",
       "      <td>2021-04-28</td>\n",
       "      <td>NaN</td>\n",
       "      <td>6.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "    Unnamed: 0                                                 篇名  \\\n",
       "0            1                                 互联网思维模式下的新媒体营销策略探索   \n",
       "1            2                       互联网背景下民族高校辅导员思想引领能力结构分析及提升路径   \n",
       "2            3              “互联网+”背景下新媒体短视频对英语口语学习的影响研究——以抖音短视频为例   \n",
       "3            4                         “互联网+”背景下图书出版与新媒体融合发展的路径探析   \n",
       "4            5                                   吕梁市县级融媒体中心发展建设探究   \n",
       "5            6                                 互联网时代下邮轮旅游网络营销策略分析   \n",
       "6            7                       互联网新媒体传播中农村职业培训脱贫致富基本内涵的逻辑生成   \n",
       "7            8                     基于“互联网+”背景的四川阿坝藏族羌族自治州普法工作创新研究   \n",
       "8            9                   新媒体视角下的温州传统伴手礼品牌构建与推广研究——以手工艺品为例   \n",
       "9           10                                “互联网+”背景下农产品新媒体营销方法   \n",
       "10          11            新媒体与高校思想政治教育研究——评《新媒体视域下高校思想政治教育的解读与重构》   \n",
       "11          12                               互联网+背景下博物馆文化创意产品研发路径   \n",
       "12          13                                论我国网上理论阵地的传播态势与发展大势   \n",
       "13          14                            网络新媒体背景下大学生思想政治教育的发展新趋势   \n",
       "14          15                               基于分众传播视角下的地方媒体融合路径探析   \n",
       "15          16                                新媒体时代围棋文化传播面临的机遇与挑战   \n",
       "16          17                    新媒体广告生态下互联网营销服务平台的赋能逻辑——以巨量引擎为例   \n",
       "17          18                                 互联网背景下企业思政工作的信息化建设   \n",
       "18          19                                   大数据背景下传统媒体突围策略分析   \n",
       "19          20                              基于互联网时代的传统广播新媒体化的路径探讨   \n",
       "20          21                            企业工会实施“互联网+工会”工作模式的实践思考   \n",
       "21          22                                   让跟评成为地方新闻APP的胜负手   \n",
       "22          23                                            互联网与新媒体   \n",
       "23          24                              “互联网+”背景下煤矿企业工会宣传创新对策   \n",
       "24          25                                         新媒体内容的选择策略   \n",
       "25          26                                  广播电视技术与互联网技术的融合分析   \n",
       "26          27                             互联网环境下大学生意识形态和思想政治教育研究   \n",
       "27          28                                     融合媒体背景之下编辑转型之路   \n",
       "28          29                    浅谈博物馆的新媒体运用以及相关问题——以四川自贡恐龙博物馆为例   \n",
       "29          30                          试析融媒体平台发展现状和突破——以“中央厨房”为例   \n",
       "30          31                                    新媒体语境下数据造假与治理策略   \n",
       "31          32                         “互联网+”视域下甘肃少数民族文化数字化建设路径探析   \n",
       "32          33                           网络经济下河南省科技型中小企业营销创新路径与策略   \n",
       "33          34                                 大数据与新媒体时代医学期刊的办刊之路   \n",
       "34          35                    媒体融合视域下城市广播电视报发展策略——以《淮安广播电视》为例   \n",
       "35          36                 新时代网络反腐的媒介化实践及其制度重构——以中纪委网站的反腐实践为例   \n",
       "36          37                              新媒体时代高等艺术设计教育体系构建策略研究   \n",
       "37          38                           网络“VACA”时代高校教师培训体系构建策略探析   \n",
       "38          39                        全媒体时代大学生理想信念教育提升路径——以滁州学院为例   \n",
       "39          40                               高校学生沟通机制探究——以北京某高校为例   \n",
       "40          41                             后疫情时代改善与重塑国家形象的新媒体传播策略   \n",
       "41          42                                  促进曲艺网络传播 助力传统艺术繁荣   \n",
       "42          43              互联网背景下大学公共英语教学效率提升研究——评《网络与新媒体专业英语教程》   \n",
       "43          44                    “三全”育人背景下高校“党建+”特色档案融入课程思政教育的研究   \n",
       "44          45                             乡村振兴视域下汕尾农产品新媒体营销的对策研究   \n",
       "45          46  The Influence of New Media on College Students...   \n",
       "46          47                                   新媒体背景下高校招生营销策略研究   \n",
       "47          48                              新时代高校网络思想政治教育创新发展逻辑理路   \n",
       "48          49                                 既抓增量又抓存量，防止改革“翻烧饼”   \n",
       "49          50                                   新时期新闻信息高质量传播路径探讨   \n",
       "\n",
       "                       作者                                            刊名  \\\n",
       "0                      郑昕                                中小企业管理与科技(中旬刊)   \n",
       "1           崔洋洋; 康丽滢; 陈志新                                         办公自动化   \n",
       "2             邵玉杰; 关淼; 王琦                                        现代商贸工业   \n",
       "3                      刘芳                                          文化产业   \n",
       "4                     郭小芳                                        吕梁学院学报   \n",
       "5   杜峰帅; 石兴; 肖素雅; 宋林潇; 罗群                                            商讯   \n",
       "6                 杨宗晓; 杨克                                          农业经济   \n",
       "7               田裕婷; 索郎玉珍                                   科学咨询(科技·管理)   \n",
       "8                      陈瑶                                     现代营销(经营版)   \n",
       "9                      吉思                                     现代营销(经营版)   \n",
       "10                    高倩楠                                          中国测试   \n",
       "11                朱科宇; 彭静                                          今古文创   \n",
       "12                    王凤翔                                 湖南大学学报(社会科学版)   \n",
       "13                    李文峰                                    湖北开放职业学院学报   \n",
       "14                    宿晓伟                                          传媒论坛   \n",
       "15                    薛铭安                                          传媒论坛   \n",
       "16                    李红妮                                            传媒   \n",
       "17                     高磊                                          公关世界   \n",
       "18                     于佳                                          中国报业   \n",
       "19                李东; 徐燕群                                          新闻传播   \n",
       "20                    周春丽                                          工会博览   \n",
       "21                     余淮                                          中国报业   \n",
       "22                    NaN                                        广播电视信息   \n",
       "23                     秦越                                中小企业管理与科技(中旬刊)   \n",
       "24                    胡亚兰                                          企业文明   \n",
       "25                     刘颖                                        中国有线电视   \n",
       "26                     相菲                                       产业与科技论坛   \n",
       "27                    白绍华                                        中国传媒科技   \n",
       "28                    吴雨纱                                        中国传媒科技   \n",
       "29                     钱璟                                        中国传媒科技   \n",
       "30                     岳圆                                          记者摇篮   \n",
       "31                    张娅琼                                    科学咨询(教育科研)   \n",
       "32                    王炳刚                                        人才资源开发   \n",
       "33       焦骞; 刘卓; 董军杰; 张爱净                                          传媒论坛   \n",
       "34                    万晓莉                                            视听   \n",
       "35                     董浩                                          传媒观察   \n",
       "36                     李珂                                          艺术教育   \n",
       "37                     梁琛                                        新闻研究导刊   \n",
       "38               陈沭文; 张晶晶                                        新闻研究导刊   \n",
       "39                何家唯; 张权                                     学校党建与思想教育   \n",
       "40               匡文波; 马茜茜                                         新闻与写作   \n",
       "41                    张天来                                            曲艺   \n",
       "42                     赵鹏                                      中国广播电视学刊   \n",
       "43                    肖国圣                                       产业与科技论坛   \n",
       "44                    吴丽文                                          市场周刊   \n",
       "45                 Hui Ai  Journal of Contemporary Educational Research   \n",
       "46                    张婷婷                                      广西质量监督导报   \n",
       "47                    张培卫                                       高校辅导员学刊   \n",
       "48                    李社军                                           新闻潮   \n",
       "49                    覃雅妮                                           新闻潮   \n",
       "\n",
       "          发表时间  被引     下载   操作  \n",
       "0   2021-06-15 NaN    NaN   下载  \n",
       "1   2021-06-15 NaN    NaN   下载  \n",
       "2   2021-06-11 NaN    NaN   下载  \n",
       "3   2021-06-10 NaN    NaN   下载  \n",
       "4   2021-06-07 NaN   26.0   下载  \n",
       "5   2021-06-05 NaN   73.0   下载  \n",
       "6   2021-06-04 NaN   82.0   下载  \n",
       "7   2021-06-04 NaN    6.0   下载  \n",
       "8   2021-06-01 NaN    4.0   下载  \n",
       "9   2021-06-01 NaN   20.0   下载  \n",
       "10  2021-05-31 NaN   16.0   下载  \n",
       "11  2021-05-28 NaN  580.0   下载  \n",
       "12  2021-05-28 NaN   31.0   下载  \n",
       "13  2021-05-28 NaN   28.0   下载  \n",
       "14  2021-05-25 NaN  126.0   下载  \n",
       "15  2021-05-25 NaN  124.0   下载  \n",
       "16  2021-05-25 NaN   60.0   下载  \n",
       "17  2021-05-25 NaN    9.0   下载  \n",
       "18  2021-05-25 NaN   54.0   下载  \n",
       "19  2021-05-23 NaN   10.0   下载  \n",
       "20  2021-05-20 NaN    NaN   下载  \n",
       "21  2021-05-15 NaN    8.0   下载  \n",
       "22  2021-05-15 NaN  137.0   下载  \n",
       "23  2021-05-15 NaN   32.0   下载  \n",
       "24  2021-05-15 NaN   31.0   下载  \n",
       "25  2021-05-15 NaN   27.0   下载  \n",
       "26  2021-05-15 NaN   43.0   下载  \n",
       "27  2021-05-15 NaN   11.0   下载  \n",
       "28  2021-05-15 NaN   23.0   下载  \n",
       "29  2021-05-15 NaN   88.0   下载  \n",
       "30  2021-05-15 NaN    NaN   下载  \n",
       "31  2021-05-14 NaN   26.0   下载  \n",
       "32  2021-05-10 NaN   81.0   下载  \n",
       "33  2021-05-10 NaN   34.0   下载  \n",
       "34  2021-05-10 NaN   53.0   下载  \n",
       "35  2021-05-10 NaN   86.0   下载  \n",
       "36  2021-05-10 NaN   14.0   下载  \n",
       "37  2021-05-10 NaN    7.0   下载  \n",
       "38  2021-05-10 NaN   10.0   下载  \n",
       "39  2021-05-08 NaN   69.0   下载  \n",
       "40  2021-05-05 NaN  475.0   下载  \n",
       "41  2021-05-01 NaN   18.0   下载  \n",
       "42  2021-05-01 NaN   21.0   下载  \n",
       "43  2021-05-01 NaN   38.0   下载  \n",
       "44  2021-05-01 NaN    1.0   下载  \n",
       "45  2021-05-01 NaN    NaN  NaN  \n",
       "46  2021-04-28 NaN  129.0   下载  \n",
       "47  2021-04-28 NaN   73.0   下载  \n",
       "48  2021-04-28 NaN    2.0   下载  \n",
       "49  2021-04-28 NaN    6.0   下载  "
      ]
     },
     "execution_count": 102,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 抓取首页信息\n",
    "element = driver.find_element_by_id('gridTable')\n",
    "含有页面主要数据的表格html_ = element.get_attribute('innerHTML')\n",
    "首页主要数据 = pd.read_html(含有页面主要数据的表格html_)[0]\n",
    "首页主要数据"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 103,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'下一页'"
      ]
     },
     "execution_count": 103,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 翻页\n",
    "element = driver.find_element_by_id('PageNext')\n",
    "element.get_attribute('innerHTML')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 104,
   "metadata": {},
   "outputs": [],
   "source": [
    "table_html = dict()\n",
    "main_content =\"\"\n",
    "element = None"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 105,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20]\n"
     ]
    }
   ],
   "source": [
    "# 选择前1000篇进行信息爬取\n",
    "pages = list(range(1,21))\n",
    "print(pages)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 106,
   "metadata": {},
   "outputs": [],
   "source": [
    "def process_pages (pages):\n",
    "    for p in pages:\n",
    "        print (p,end='\\t')\n",
    "        # 点击下一页\n",
    "        跳转 = driver.find_element_by_id('PageNext')\n",
    "        跳转.click()\n",
    "        # 如果想爬更多，可以设置更长的停顿时间\n",
    "        # 有这一步可省略使用自动监测二维码的步骤\n",
    "        time.sleep(10+30*random())\n",
    "        # 获取含有页面主要数据的表格\n",
    "        element = driver.find_element_by_id('gridTable')\n",
    "        main_content = element.get_attribute('innerHTML')\n",
    "        table_html[p] = main_content"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 107,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "1\t2\t3\t4\t5\t6\t7\t8\t9\t10\t11\t12\t13\t14\t15\t16\t17\t18\t19\t20\t"
     ]
    }
   ],
   "source": [
    "process_pages(pages)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 108,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>html_snippets</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>\\n&lt;div class=\"toolbar\"&gt;&lt;div id=\"countPageDiv\" ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>\\n&lt;div class=\"toolbar\"&gt;&lt;div id=\"countPageDiv\" ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>\\n&lt;div class=\"toolbar\"&gt;&lt;div id=\"countPageDiv\" ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>\\n&lt;div class=\"toolbar\"&gt;&lt;div id=\"countPageDiv\" ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>\\n&lt;div class=\"toolbar\"&gt;&lt;div id=\"countPageDiv\" ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>\\n&lt;div class=\"toolbar\"&gt;&lt;div id=\"countPageDiv\" ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>\\n&lt;div class=\"toolbar\"&gt;&lt;div id=\"countPageDiv\" ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>\\n&lt;div class=\"toolbar\"&gt;&lt;div id=\"countPageDiv\" ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>\\n&lt;div class=\"toolbar\"&gt;&lt;div id=\"countPageDiv\" ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10</th>\n",
       "      <td>\\n&lt;div class=\"toolbar\"&gt;&lt;div id=\"countPageDiv\" ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>11</th>\n",
       "      <td>\\n&lt;div class=\"toolbar\"&gt;&lt;div id=\"countPageDiv\" ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>12</th>\n",
       "      <td>\\n&lt;div class=\"toolbar\"&gt;&lt;div id=\"countPageDiv\" ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>13</th>\n",
       "      <td>\\n&lt;div class=\"toolbar\"&gt;&lt;div id=\"countPageDiv\" ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>14</th>\n",
       "      <td>\\n&lt;div class=\"toolbar\"&gt;&lt;div id=\"countPageDiv\" ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>15</th>\n",
       "      <td>\\n&lt;div class=\"toolbar\"&gt;&lt;div id=\"countPageDiv\" ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>16</th>\n",
       "      <td>\\n&lt;div class=\"toolbar\"&gt;&lt;div id=\"countPageDiv\" ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>17</th>\n",
       "      <td>\\n&lt;div class=\"toolbar\"&gt;&lt;div id=\"countPageDiv\" ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>18</th>\n",
       "      <td>\\n&lt;div class=\"toolbar\"&gt;&lt;div id=\"countPageDiv\" ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>19</th>\n",
       "      <td>\\n&lt;div class=\"toolbar\"&gt;&lt;div id=\"countPageDiv\" ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>20</th>\n",
       "      <td>\\n&lt;div class=\"toolbar\"&gt;&lt;div id=\"countPageDiv\" ...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                        html_snippets\n",
       "1   \\n<div class=\"toolbar\"><div id=\"countPageDiv\" ...\n",
       "2   \\n<div class=\"toolbar\"><div id=\"countPageDiv\" ...\n",
       "3   \\n<div class=\"toolbar\"><div id=\"countPageDiv\" ...\n",
       "4   \\n<div class=\"toolbar\"><div id=\"countPageDiv\" ...\n",
       "5   \\n<div class=\"toolbar\"><div id=\"countPageDiv\" ...\n",
       "6   \\n<div class=\"toolbar\"><div id=\"countPageDiv\" ...\n",
       "7   \\n<div class=\"toolbar\"><div id=\"countPageDiv\" ...\n",
       "8   \\n<div class=\"toolbar\"><div id=\"countPageDiv\" ...\n",
       "9   \\n<div class=\"toolbar\"><div id=\"countPageDiv\" ...\n",
       "10  \\n<div class=\"toolbar\"><div id=\"countPageDiv\" ...\n",
       "11  \\n<div class=\"toolbar\"><div id=\"countPageDiv\" ...\n",
       "12  \\n<div class=\"toolbar\"><div id=\"countPageDiv\" ...\n",
       "13  \\n<div class=\"toolbar\"><div id=\"countPageDiv\" ...\n",
       "14  \\n<div class=\"toolbar\"><div id=\"countPageDiv\" ...\n",
       "15  \\n<div class=\"toolbar\"><div id=\"countPageDiv\" ...\n",
       "16  \\n<div class=\"toolbar\"><div id=\"countPageDiv\" ...\n",
       "17  \\n<div class=\"toolbar\"><div id=\"countPageDiv\" ...\n",
       "18  \\n<div class=\"toolbar\"><div id=\"countPageDiv\" ...\n",
       "19  \\n<div class=\"toolbar\"><div id=\"countPageDiv\" ...\n",
       "20  \\n<div class=\"toolbar\"><div id=\"countPageDiv\" ..."
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "df = pd.DataFrame([table_html]).T\n",
    "df.columns = [\"html_snippets\"]\n",
    "display(df)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 109,
   "metadata": {},
   "outputs": [],
   "source": [
    "网站 = \"中国知网\"\n",
    "# 指定内容输出的位置\n",
    "fn = { \"output\" : { \"htm_snippets\": \"知网_htm_snippets_{网站}.tsv\"}\n",
    "     }"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 110,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 保存页面内容的csv文件\n",
    "filename = fn [\"output\"] [\"htm_snippets\"] \n",
    "df.to_csv(filename.format(网站=网站), sep=\"\\t\", encoding=\"utf8\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 111,
   "metadata": {},
   "outputs": [],
   "source": [
    "l_df = []\n",
    "for p in pages:\n",
    "    table = pd.read_html(table_html[p])[0]\n",
    "    l_df.append(table)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 113,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Unnamed: 0</th>\n",
       "      <th>篇名</th>\n",
       "      <th>作者</th>\n",
       "      <th>刊名</th>\n",
       "      <th>发表时间</th>\n",
       "      <th>被引</th>\n",
       "      <th>下载</th>\n",
       "      <th>操作</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>互联网思维模式下的新媒体营销策略探索</td>\n",
       "      <td>郑昕</td>\n",
       "      <td>中小企业管理与科技(中旬刊)</td>\n",
       "      <td>2021-06-15</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2</td>\n",
       "      <td>互联网背景下民族高校辅导员思想引领能力结构分析及提升路径</td>\n",
       "      <td>崔洋洋; 康丽滢; 陈志新</td>\n",
       "      <td>办公自动化</td>\n",
       "      <td>2021-06-15</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>3</td>\n",
       "      <td>“互联网+”背景下新媒体短视频对英语口语学习的影响研究——以抖音短视频为例</td>\n",
       "      <td>邵玉杰; 关淼; 王琦</td>\n",
       "      <td>现代商贸工业</td>\n",
       "      <td>2021-06-11</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>4</td>\n",
       "      <td>“互联网+”背景下图书出版与新媒体融合发展的路径探析</td>\n",
       "      <td>刘芳</td>\n",
       "      <td>文化产业</td>\n",
       "      <td>2021-06-10</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>5</td>\n",
       "      <td>吕梁市县级融媒体中心发展建设探究</td>\n",
       "      <td>郭小芳</td>\n",
       "      <td>吕梁学院学报</td>\n",
       "      <td>2021-06-07</td>\n",
       "      <td>NaN</td>\n",
       "      <td>26.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>995</th>\n",
       "      <td>1046</td>\n",
       "      <td>“互联网+时代”助推鲜花市场再升级</td>\n",
       "      <td>刘婉秋</td>\n",
       "      <td>商讯</td>\n",
       "      <td>2020-04-13</td>\n",
       "      <td>NaN</td>\n",
       "      <td>370.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>996</th>\n",
       "      <td>1047</td>\n",
       "      <td>“互联网+”背景下高职院校后勤育人模式分析</td>\n",
       "      <td>沈兴木</td>\n",
       "      <td>智库时代</td>\n",
       "      <td>2020-04-13</td>\n",
       "      <td>2.0</td>\n",
       "      <td>63.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>997</th>\n",
       "      <td>1048</td>\n",
       "      <td>浅析“互联网+”时代省级少儿公共图书馆的特色阅读推广</td>\n",
       "      <td>刘映潇</td>\n",
       "      <td>文化创新比较研究</td>\n",
       "      <td>2020-04-11</td>\n",
       "      <td>1.0</td>\n",
       "      <td>58.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>998</th>\n",
       "      <td>1049</td>\n",
       "      <td>移动互联网时代新媒体用户的认知图景——以百度百家号的调研分析结论为例(1)</td>\n",
       "      <td>喻国明学术工作室; 刘淼; 韩婷</td>\n",
       "      <td>传媒观察</td>\n",
       "      <td>2020-04-10 11:05</td>\n",
       "      <td>2.0</td>\n",
       "      <td>299.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>999</th>\n",
       "      <td>1050</td>\n",
       "      <td>网络互动化教学模式在现代思政课程教育改革中的应用——评《新媒体视角下大学生思政教育创新探索》</td>\n",
       "      <td>姚湘莲; 向琳丽</td>\n",
       "      <td>化学试剂</td>\n",
       "      <td>2020-04-10</td>\n",
       "      <td>6.0</td>\n",
       "      <td>642.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>1050 rows × 8 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "     Unnamed: 0                                              篇名  \\\n",
       "0             1                              互联网思维模式下的新媒体营销策略探索   \n",
       "1             2                    互联网背景下民族高校辅导员思想引领能力结构分析及提升路径   \n",
       "2             3           “互联网+”背景下新媒体短视频对英语口语学习的影响研究——以抖音短视频为例   \n",
       "3             4                      “互联网+”背景下图书出版与新媒体融合发展的路径探析   \n",
       "4             5                                吕梁市县级融媒体中心发展建设探究   \n",
       "..          ...                                             ...   \n",
       "995        1046                               “互联网+时代”助推鲜花市场再升级   \n",
       "996        1047                           “互联网+”背景下高职院校后勤育人模式分析   \n",
       "997        1048                      浅析“互联网+”时代省级少儿公共图书馆的特色阅读推广   \n",
       "998        1049           移动互联网时代新媒体用户的认知图景——以百度百家号的调研分析结论为例(1)   \n",
       "999        1050  网络互动化教学模式在现代思政课程教育改革中的应用——评《新媒体视角下大学生思政教育创新探索》   \n",
       "\n",
       "                   作者              刊名              发表时间   被引     下载  操作  \n",
       "0                  郑昕  中小企业管理与科技(中旬刊)        2021-06-15  NaN    NaN  下载  \n",
       "1       崔洋洋; 康丽滢; 陈志新           办公自动化        2021-06-15  NaN    NaN  下载  \n",
       "2         邵玉杰; 关淼; 王琦          现代商贸工业        2021-06-11  NaN    NaN  下载  \n",
       "3                  刘芳            文化产业        2021-06-10  NaN    NaN  下载  \n",
       "4                 郭小芳          吕梁学院学报        2021-06-07  NaN   26.0  下载  \n",
       "..                ...             ...               ...  ...    ...  ..  \n",
       "995               刘婉秋              商讯        2020-04-13  NaN  370.0  下载  \n",
       "996               沈兴木            智库时代        2020-04-13  2.0   63.0  下载  \n",
       "997               刘映潇        文化创新比较研究        2020-04-11  1.0   58.0  下载  \n",
       "998  喻国明学术工作室; 刘淼; 韩婷            传媒观察  2020-04-10 11:05  2.0  299.0  下载  \n",
       "999          姚湘莲; 向琳丽            化学试剂        2020-04-10  6.0  642.0  下载  \n",
       "\n",
       "[1050 rows x 8 columns]"
      ]
     },
     "execution_count": 113,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_url_out = pd.concat(l_df).reset_index(drop=True)\n",
    "df_总表格 = 首页主要数据.append(df_url_out)\n",
    "df_总表格"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 114,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Unnamed: 0</th>\n",
       "      <th>篇名</th>\n",
       "      <th>作者</th>\n",
       "      <th>刊名</th>\n",
       "      <th>发表时间</th>\n",
       "      <th>被引</th>\n",
       "      <th>下载</th>\n",
       "      <th>操作</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>互联网思维模式下的新媒体营销策略探索</td>\n",
       "      <td>郑昕</td>\n",
       "      <td>中小企业管理与科技(中旬刊)</td>\n",
       "      <td>2021-06-15</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2</td>\n",
       "      <td>互联网背景下民族高校辅导员思想引领能力结构分析及提升路径</td>\n",
       "      <td>崔洋洋; 康丽滢; 陈志新</td>\n",
       "      <td>办公自动化</td>\n",
       "      <td>2021-06-15</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>3</td>\n",
       "      <td>“互联网+”背景下新媒体短视频对英语口语学习的影响研究——以抖音短视频为例</td>\n",
       "      <td>邵玉杰; 关淼; 王琦</td>\n",
       "      <td>现代商贸工业</td>\n",
       "      <td>2021-06-11</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>4</td>\n",
       "      <td>“互联网+”背景下图书出版与新媒体融合发展的路径探析</td>\n",
       "      <td>刘芳</td>\n",
       "      <td>文化产业</td>\n",
       "      <td>2021-06-10</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>5</td>\n",
       "      <td>吕梁市县级融媒体中心发展建设探究</td>\n",
       "      <td>郭小芳</td>\n",
       "      <td>吕梁学院学报</td>\n",
       "      <td>2021-06-07</td>\n",
       "      <td>NaN</td>\n",
       "      <td>26.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>995</th>\n",
       "      <td>1046</td>\n",
       "      <td>“互联网+时代”助推鲜花市场再升级</td>\n",
       "      <td>刘婉秋</td>\n",
       "      <td>商讯</td>\n",
       "      <td>2020-04-13</td>\n",
       "      <td>NaN</td>\n",
       "      <td>370.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>996</th>\n",
       "      <td>1047</td>\n",
       "      <td>“互联网+”背景下高职院校后勤育人模式分析</td>\n",
       "      <td>沈兴木</td>\n",
       "      <td>智库时代</td>\n",
       "      <td>2020-04-13</td>\n",
       "      <td>2.0</td>\n",
       "      <td>63.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>997</th>\n",
       "      <td>1048</td>\n",
       "      <td>浅析“互联网+”时代省级少儿公共图书馆的特色阅读推广</td>\n",
       "      <td>刘映潇</td>\n",
       "      <td>文化创新比较研究</td>\n",
       "      <td>2020-04-11</td>\n",
       "      <td>1.0</td>\n",
       "      <td>58.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>998</th>\n",
       "      <td>1049</td>\n",
       "      <td>移动互联网时代新媒体用户的认知图景——以百度百家号的调研分析结论为例(1)</td>\n",
       "      <td>喻国明学术工作室; 刘淼; 韩婷</td>\n",
       "      <td>传媒观察</td>\n",
       "      <td>2020-04-10 11:05</td>\n",
       "      <td>2.0</td>\n",
       "      <td>299.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>999</th>\n",
       "      <td>1050</td>\n",
       "      <td>网络互动化教学模式在现代思政课程教育改革中的应用——评《新媒体视角下大学生思政教育创新探索》</td>\n",
       "      <td>姚湘莲; 向琳丽</td>\n",
       "      <td>化学试剂</td>\n",
       "      <td>2020-04-10</td>\n",
       "      <td>6.0</td>\n",
       "      <td>642.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>1050 rows × 8 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "     Unnamed: 0                                              篇名  \\\n",
       "0             1                              互联网思维模式下的新媒体营销策略探索   \n",
       "1             2                    互联网背景下民族高校辅导员思想引领能力结构分析及提升路径   \n",
       "2             3           “互联网+”背景下新媒体短视频对英语口语学习的影响研究——以抖音短视频为例   \n",
       "3             4                      “互联网+”背景下图书出版与新媒体融合发展的路径探析   \n",
       "4             5                                吕梁市县级融媒体中心发展建设探究   \n",
       "..          ...                                             ...   \n",
       "995        1046                               “互联网+时代”助推鲜花市场再升级   \n",
       "996        1047                           “互联网+”背景下高职院校后勤育人模式分析   \n",
       "997        1048                      浅析“互联网+”时代省级少儿公共图书馆的特色阅读推广   \n",
       "998        1049           移动互联网时代新媒体用户的认知图景——以百度百家号的调研分析结论为例(1)   \n",
       "999        1050  网络互动化教学模式在现代思政课程教育改革中的应用——评《新媒体视角下大学生思政教育创新探索》   \n",
       "\n",
       "                   作者              刊名              发表时间   被引     下载  操作  \n",
       "0                  郑昕  中小企业管理与科技(中旬刊)        2021-06-15  NaN    NaN  下载  \n",
       "1       崔洋洋; 康丽滢; 陈志新           办公自动化        2021-06-15  NaN    NaN  下载  \n",
       "2         邵玉杰; 关淼; 王琦          现代商贸工业        2021-06-11  NaN    NaN  下载  \n",
       "3                  刘芳            文化产业        2021-06-10  NaN    NaN  下载  \n",
       "4                 郭小芳          吕梁学院学报        2021-06-07  NaN   26.0  下载  \n",
       "..                ...             ...               ...  ...    ...  ..  \n",
       "995               刘婉秋              商讯        2020-04-13  NaN  370.0  下载  \n",
       "996               沈兴木            智库时代        2020-04-13  2.0   63.0  下载  \n",
       "997               刘映潇        文化创新比较研究        2020-04-11  1.0   58.0  下载  \n",
       "998  喻国明学术工作室; 刘淼; 韩婷            传媒观察  2020-04-10 11:05  2.0  299.0  下载  \n",
       "999          姚湘莲; 向琳丽            化学试剂        2020-04-10  6.0  642.0  下载  \n",
       "\n",
       "[1050 rows x 8 columns]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# 将内容表格存到本地\n",
    "with pd.ExcelWriter('知网文章数据爬取.xlsx',mode='w',engine=\"openpyxl\") as writer:  \n",
    "            df_总表格.to_excel(writer,sheet_name=\"知网数据\")\n",
    "display(df_总表格)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 导出文件"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### 第一批"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 115,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 经历过翻页以后 回去首页\n",
    "driver.find_element_by_id('total').click()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 116,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]\n"
     ]
    }
   ],
   "source": [
    "# 导出refworks文件（.txt）和下载文章\n",
    "# 每次全选不能超过500篇，分2次进行\n",
    "\n",
    "pages = list(range(1,11))\n",
    "print(pages)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 117,
   "metadata": {},
   "outputs": [],
   "source": [
    "table_html = dict()\n",
    "main_content =\"\"\n",
    "element = None"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 118,
   "metadata": {},
   "outputs": [],
   "source": [
    "def process_choose (pages):\n",
    "    for p in pages:\n",
    "        print (p,end='\\t')\n",
    "        # 点击下一页\n",
    "        全选 = driver.find_element_by_id('selectCheckAll1')\n",
    "        全选.click()\n",
    "        跳转 = driver.find_element_by_id('PageNext')\n",
    "        跳转.click()\n",
    "        # 如果想爬更多，可以设置更长的停顿时间\n",
    "        # 有这一步可省略使用自动监测二维码的步骤\n",
    "        time.sleep(10+30*random())\n",
    "        # 获取含有页面主要数据的表格\n",
    "        element = driver.find_element_by_id('gridTable')\n",
    "        main_content = element.get_attribute('innerHTML')\n",
    "        table_html[p] = main_content"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 119,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "1\t2\t3\t4\t5\t6\t7\t8\t9\t10\t"
     ]
    }
   ],
   "source": [
    "process_choose(pages)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 120,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 导出与分析 \n",
    "element = driver.find_element_by_xpath('//i[@class=\"icon-d\"]').click()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 121,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 点击导出文献\n",
    "element = driver.find_element_by_xpath('//*[@id=\"batchOpsBox\"]/li[2]/ul/li[1]/a').click()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 122,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 点击Refworks\n",
    "element = driver.find_element_by_xpath('//a[@exporttype=\"Refworks\"]').click()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 123,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['CDwindow-142ED4C6A10ECBA6B1A26B59E771BF77',\n",
       " 'CDwindow-D2BE11C78CEAA95F118D785EE2E6D357',\n",
       " 'CDwindow-6CF6F0CE83AA676228E8A16EE029CCDD']"
      ]
     },
     "execution_count": 123,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 所有窗口ID\n",
    "driver.window_handles"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 124,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "<ipython-input-124-1c4eb9da559e>:2: DeprecationWarning: use driver.switch_to.window instead\n",
      "  driver.switch_to_window(driver.window_handles[2])\n"
     ]
    }
   ],
   "source": [
    "# 窗口切换详细页面\n",
    "driver.switch_to_window(driver.window_handles[2])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 125,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 导出 .txt文件\n",
    "element = driver.find_element_by_xpath('//i[@class=\"icon icon-export\"]').click()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 126,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "<ipython-input-126-fb509243f5d6>:2: DeprecationWarning: use driver.switch_to.window instead\n",
      "  driver.switch_to_window(driver.window_handles[1])\n"
     ]
    }
   ],
   "source": [
    "# 窗口切换批量导出页面\n",
    "driver.switch_to_window(driver.window_handles[1])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 127,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 第一次批量下载文献全文（500篇）\n",
    "element = driver.find_element_by_xpath('//li[@class=\"bulkdownload export\"]').click()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 128,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "<ipython-input-128-797da17fb4ce>:2: DeprecationWarning: use driver.switch_to.window instead\n",
      "  driver.switch_to_window(driver.window_handles[3])\n"
     ]
    }
   ],
   "source": [
    "# Switch to the bulk export page window (handle index 3).\n",
    "# Uses driver.switch_to.window, the replacement for the deprecated switch_to_window.\n",
    "driver.switch_to.window(driver.window_handles[3])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 129,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Download the selected articles (500 articles).\n",
    "# find_element(by, value) replaces the deprecated find_element_by_id helper;\n",
    "# click() returns None, so nothing is assigned.\n",
    "driver.find_element('id', 'btn-download-all').click()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 130,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "<ipython-input-130-2b2093cf9c48>:2: DeprecationWarning: use driver.switch_to.window instead\n",
      "  driver.switch_to_window(driver.window_handles[1])\n"
     ]
    }
   ],
   "source": [
    "# Switch back to the paper list page window (handle index 1).\n",
    "# Uses driver.switch_to.window, the replacement for the deprecated switch_to_window.\n",
    "driver.switch_to.window(driver.window_handles[1])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### 第二批"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 131,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "<ipython-input-131-2b2093cf9c48>:2: DeprecationWarning: use driver.switch_to.window instead\n",
      "  driver.switch_to_window(driver.window_handles[1])\n"
     ]
    }
   ],
   "source": [
    "# Switch to the paper list page window (handle index 1).\n",
    "# Uses driver.switch_to.window, the replacement for the deprecated switch_to_window.\n",
    "driver.switch_to.window(driver.window_handles[1])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 132,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Clear the currently selected articles.\n",
    "# find_element(by, value) replaces the deprecated find_element_by_xpath helper;\n",
    "# click() returns None, so nothing is assigned.\n",
    "driver.find_element('xpath', '//*[@id=\"gridTable\"]/div[1]/div[2]/div[1]/a').click()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 133,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[11, 12, 13, 14, 15, 16, 17, 18, 19, 20]\n"
     ]
    }
   ],
   "source": [
    "# Page numbers handled in the second batch: 11 through 20 inclusive.\n",
    "pages = [*range(11, 21)]\n",
    "print(pages)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 134,
   "metadata": {},
   "outputs": [],
   "source": [
    "def process_choose(pages, min_pause=10, extra_pause=30):\n",
    "    \"\"\"Select all results page by page and cache each following page's table HTML.\n",
    "\n",
    "    For every entry in ``pages`` the function clicks the element with id\n",
    "    ``selectCheckAll1``, clicks the element with id ``PageNext``, pauses, and\n",
    "    then stores the ``innerHTML`` of the element with id ``gridTable`` in\n",
    "    ``table_html`` under that entry.\n",
    "\n",
    "    Relies on the notebook-level names ``driver`` and ``table_html``.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    pages : iterable\n",
    "        Keys under which the HTML captured after each page turn is stored;\n",
    "        each one is also printed (tab-separated) as a progress marker.\n",
    "    min_pause : float, optional\n",
    "        Minimum number of seconds to sleep after each page turn (default 10).\n",
    "    extra_pause : float, optional\n",
    "        Upper bound, in seconds, of the additional random sleep (default 30).\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    None\n",
    "    \"\"\"\n",
    "    for page in pages:\n",
    "        print(page, end='\\t')\n",
    "        # Click the select-all control on the page currently shown.\n",
    "        driver.find_element('id', 'selectCheckAll1').click()\n",
    "        # Click through to the next page of results.\n",
    "        driver.find_element('id', 'PageNext').click()\n",
    "        # Randomised pause; set a longer one to crawl more pages. Per the original\n",
    "        # author's note, pausing here makes the automatic QR-code detection step\n",
    "        # unnecessary.\n",
    "        time.sleep(min_pause + extra_pause * random())\n",
    "        # Grab the table that holds the page's main data.\n",
    "        grid = driver.find_element('id', 'gridTable')\n",
    "        table_html[page] = grid.get_attribute('innerHTML')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 135,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "11\t12\t13\t14\t15\t16\t17\t18\t19\t20\t"
     ]
    }
   ],
   "source": [
    "# Run the select-all / next-page loop over the page numbers held in `pages`.\n",
    "process_choose(pages)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 136,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Open the \"Export & Analysis\" menu.\n",
    "# find_element(by, value) replaces the deprecated find_element_by_xpath helper;\n",
    "# click() returns None, so nothing is assigned.\n",
    "driver.find_element('xpath', '//i[@class=\"icon-d\"]').click()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 137,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Click \"export documents\" inside the batch operations box.\n",
    "# click() returns None, so nothing is assigned.\n",
    "driver.find_element('xpath', '//*[@id=\"batchOpsBox\"]/li[2]/ul/li[1]/a').click()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 138,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Click the link whose exporttype attribute is \"Refworks\".\n",
    "# click() returns None, so nothing is assigned.\n",
    "driver.find_element('xpath', '//a[@exporttype=\"Refworks\"]').click()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 139,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['CDwindow-142ED4C6A10ECBA6B1A26B59E771BF77',\n",
       " 'CDwindow-D2BE11C78CEAA95F118D785EE2E6D357',\n",
       " 'CDwindow-6CF6F0CE83AA676228E8A16EE029CCDD',\n",
       " 'CDwindow-A809745AF7E3BA6B740099D93D4D201B',\n",
       " 'CDwindow-BFB1D2F519FD2DA9F0C9CEAC90D16211']"
      ]
     },
     "execution_count": 139,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Show the handles (IDs) of every window the driver currently has open.\n",
    "driver.window_handles"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 140,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "<ipython-input-140-1c4eb9da559e>:2: DeprecationWarning: use driver.switch_to.window instead\n",
      "  driver.switch_to_window(driver.window_handles[2])\n"
     ]
    }
   ],
   "source": [
    "# Switch to the detail page window (handle index 2).\n",
    "# NOTE(review): five handles are open at this point; confirm index 2 is the\n",
    "# window that shows the export for this batch.\n",
    "# Uses driver.switch_to.window, the replacement for the deprecated switch_to_window.\n",
    "driver.switch_to.window(driver.window_handles[2])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 141,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Export the .txt file by clicking the export icon.\n",
    "# click() returns None, so nothing is assigned.\n",
    "driver.find_element('xpath', '//i[@class=\"icon icon-export\"]').click()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 142,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "<ipython-input-142-fb509243f5d6>:2: DeprecationWarning: use driver.switch_to.window instead\n",
      "  driver.switch_to_window(driver.window_handles[1])\n"
     ]
    }
   ],
   "source": [
    "# Switch to the bulk export page window (handle index 1).\n",
    "# Uses driver.switch_to.window, the replacement for the deprecated switch_to_window.\n",
    "driver.switch_to.window(driver.window_handles[1])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 143,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Second bulk download of full-text articles (500 articles).\n",
    "# click() returns None, so nothing is assigned.\n",
    "driver.find_element('xpath', '//li[@class=\"bulkdownload export\"]').click()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 144,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "<ipython-input-144-797da17fb4ce>:2: DeprecationWarning: use driver.switch_to.window instead\n",
      "  driver.switch_to_window(driver.window_handles[3])\n"
     ]
    }
   ],
   "source": [
    "# Switch to the bulk export page window (handle index 3).\n",
    "# NOTE(review): index 3 is a positional assumption; confirm it is the window\n",
    "# opened by the bulk-download click for this batch.\n",
    "# Uses driver.switch_to.window, the replacement for the deprecated switch_to_window.\n",
    "driver.switch_to.window(driver.window_handles[3])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 145,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Download the selected articles (500 articles).\n",
    "# find_element(by, value) replaces the deprecated find_element_by_id helper;\n",
    "# click() returns None, so nothing is assigned.\n",
    "driver.find_element('id', 'btn-download-all').click()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.3"
  },
  "toc": {
   "base_numbering": 1,
   "nav_menu": {},
   "number_sections": true,
   "sideBar": true,
   "skip_h1_title": false,
   "title_cell": "Table of Contents",
   "title_sidebar": "Contents",
   "toc_cell": false,
   "toc_position": {},
   "toc_section_display": true,
   "toc_window_display": false
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
